library(tidyverse)
## ── Attaching packages ────────
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.1     ✔ dplyr   0.7.4
## ✔ tidyr   0.7.2     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0
## ── Conflicts ─────────────────
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(forcats)
#install.packages("plotly")
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Restore the saved workspace objects; the rest of the script expects this
# file to supply the `infant` data frame.
load(file = "infant.RData")

#infant %>% View()

# NA ----------------------------------------------------------------------
# Toy example: the share of missing entries in a vector is the mean of
# its is.na() indicator.
a <- c(NA, 1, 4, NA)
mean(is.na(a))
## [1] 0.5
# Share of missing values in every column of `infant`, drawn as horizontal
# bars (one bar per column).
# A plain function is handed to summarise_all() rather than funs(): funs()
# is deprecated as of dplyr 0.8.0, whereas a bare function is accepted by
# both older and current dplyr and keeps the original column names.
infant %>%
  summarise_all(
    function(col) sum(is.na(col)) / length(col)
  ) %>%
  # Long format: `key` holds the column name, `value` its NA share.
  gather() %>%
  ggplot(aes(x = key, y = value)) +
  geom_col() +
  coord_flip()

# Share of missing values per column, computed separately for each delivery
# year and shown as one bar-chart panel per year.
# A plain function replaces funs(), which is deprecated as of dplyr 0.8.0;
# a bare function works in older and current dplyr alike.
infant %>%
  group_by(date_of_delivery_y) %>%
  summarise_all(
    function(col) sum(is.na(col)) / length(col)
  ) %>%
  # Stack every column except the year. key/value are spelled out so the
  # column selection can be passed positionally instead of through an
  # argument literally named `...`.
  gather(key = "key", value = "value", -date_of_delivery_y) %>%
  ggplot(aes(x = key, y = value)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~date_of_delivery_y)

# Share of missing values per column over delivery years, one coloured line
# per column.
# A plain function replaces funs(), which is deprecated as of dplyr 0.8.0;
# a bare function works in older and current dplyr alike.
infant %>%
  group_by(date_of_delivery_y) %>%
  summarise_all(
    function(col) sum(is.na(col)) / length(col)
  ) %>%
  # Stack every column except the year. key/value are spelled out so the
  # column selection can be passed positionally instead of through an
  # argument literally named `...`.
  gather(key = "key", value = "value", -date_of_delivery_y) %>%
  ggplot(aes(x = date_of_delivery_y,
             y = value)) +
  geom_line(aes(color = key))

  #ggplotly()
# Share of missing values per column within each level of
# race_and_hispanic_orig_of_mother_c4, shown as a dot plot (columns on the
# y axis, NA share on the x axis, colour = group).
# A plain function replaces funs(), which is deprecated as of dplyr 0.8.0;
# a bare function works in older and current dplyr alike.
infant %>%
  group_by(race_and_hispanic_orig_of_mother_c4) %>%
  summarise_all(
    function(col) sum(is.na(col)) / length(col)
  ) %>%
  # Stack every column except the grouping variable. key/value are spelled
  # out so the column selection can be passed positionally instead of
  # through an argument literally named `...`.
  gather(
    key = "key", value = "value",
    -race_and_hispanic_orig_of_mother_c4
  ) %>%
  ggplot(aes(x = value, y = key,
             color = race_and_hispanic_orig_of_mother_c4)) +
  geom_point()

# Infant mortality --------------------------------------------------------

# Add a 0/1 death indicator: 1 when age_at_death_d is recorded, 0 when it
# is missing.
infant <- mutate(
  infant,
  death = ifelse(!is.na(age_at_death_d), 1, 0)
)

# Infant mortality rate per delivery year: number of deaths divided by the
# number of records in that year.
infant %>%
  group_by(date_of_delivery_y) %>%
  summarise(imr = sum(death) / length(death))
## # A tibble: 16 x 2
##    date_of_delivery_y     imr
##                 <int>   <dbl>
##  1               1995 0.00715
##  2               1996 0.00707
##  3               1997 0.00709
##  4               1998 0.00680
##  5               1999 0.00697
##  6               2000 0.00679
##  7               2001 0.00677
##  8               2002 0.00709
##  9               2003 0.00687
## 10               2004 0.00673
## 11               2005 0.00663
## 12               2006 0.00672
## 13               2007 0.00693
## 14               2008 0.00631
## 15               2009 0.00622
## 16               2010 0.00623
# aggregate(death ~ date_of_delivery_y,
#           FUN = function (x) {sum(x)/length(x)},
#           data = infant)

# Infant mortality rate per value of date_of_delivery_ym, plotted as a raw
# line with a default smoother on top.
infant %>%
  group_by(date_of_delivery_ym) %>%
  summarise(imr = sum(death) / length(death)) %>%
  ggplot(mapping = aes(date_of_delivery_ym, imr)) +
  geom_line() +
  geom_smooth()
## `geom_smooth()` using method = 'loess'

# Infant mortality rate per date_of_delivery_ym and sex; one coloured line
# and one smoother per sex.
infant %>%
  group_by(date_of_delivery_ym, sex) %>%
  summarise(imr = sum(death) / length(death)) %>%
  ggplot(mapping = aes(date_of_delivery_ym, imr, color = sex)) +
  geom_line() +
  geom_smooth()
## `geom_smooth()` using method = 'loess'

# NOTE(review): this pipeline repeats the sex-stratified plot of the
# preceding statement (same grouping, same aesthetics, same layers);
# confirm whether a different breakdown was intended here.
# Infant mortality rate per date_of_delivery_ym and sex; one coloured line
# and one smoother per sex.
infant %>%
  group_by(date_of_delivery_ym, sex) %>%
  summarise(imr = sum(death) / length(death)) %>%
  ggplot(mapping = aes(date_of_delivery_ym, imr, color = sex)) +
  geom_line() +
  geom_smooth()
## `geom_smooth()` using method = 'loess'

# Yearly infant mortality rate by sex, in a grid of panels: rows are levels
# of education_of_mother_c2, columns are levels of
# race_and_hispanic_orig_of_mother_c2. Each panel shows the raw series plus
# a straight-line (lm) trend without a confidence band.
infant %>%
  # Keep only records where both facetting variables are known.
  filter(
    !(is.na(education_of_mother_c2) |
        is.na(race_and_hispanic_orig_of_mother_c2))
  ) %>%
  group_by(
    date_of_delivery_y,
    sex,
    education_of_mother_c2,
    race_and_hispanic_orig_of_mother_c2
  ) %>%
  summarise(imr = sum(death) / length(death)) %>%
  ggplot(mapping = aes(date_of_delivery_y, imr, color = sex)) +
  geom_line() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_grid(education_of_mother_c2 ~ race_and_hispanic_orig_of_mother_c2)

# Gestation at birth ------------------------------------------------------

# Density-scaled histogram of gestation_at_delivery_w, one panel per
# delivery year.
ggplot(
  infant,
  aes(x = gestation_at_delivery_w, y = ..density..)
) +
  geom_histogram() +
  facet_wrap(~date_of_delivery_y)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 463049 rows containing non-finite values (stat_bin).

# Toy example: the share of elements satisfying a condition is the mean of
# the logical comparison.
a <- 1:6
mean(a > 2)
## [1] 0.6666667
# Share of records with gestation_at_delivery_w above 42, by delivery year.
# mean(..., na.rm = TRUE) leaves records with missing gestation out of both
# the numerator and the denominator, so the share is taken among records
# with a known gestation and is not diluted by year-to-year differences in
# missingness.
infant %>%
  group_by(date_of_delivery_y) %>%
  summarise(
    p = mean(gestation_at_delivery_w > 42, na.rm = TRUE)
  ) %>%
  ggplot(aes(x = date_of_delivery_y,
             y = p)) +
  geom_line()

# Called without arguments, ggplotly() converts the most recent ggplot
# into an interactive plotly widget.
ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# Share of records with gestation_at_delivery_w below 38, by delivery year.
# mean(..., na.rm = TRUE) leaves records with missing gestation out of both
# the numerator and the denominator, so the share is taken among records
# with a known gestation and is not diluted by year-to-year differences in
# missingness.
infant %>%
  group_by(date_of_delivery_y) %>%
  summarise(
    p = mean(gestation_at_delivery_w < 38, na.rm = TRUE)
  ) %>%
  ggplot(aes(x = date_of_delivery_y,
             y = p)) +
  geom_line()

# Called without arguments, ggplotly() converts the most recent ggplot
# into an interactive plotly widget.
ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# Share of records with gestation_at_delivery_w equal to 39, by delivery
# year.
# mean(..., na.rm = TRUE) leaves records with missing gestation out of both
# the numerator and the denominator, so the share is taken among records
# with a known gestation and is not diluted by year-to-year differences in
# missingness.
infant %>%
  group_by(date_of_delivery_y) %>%
  summarise(
    p = mean(gestation_at_delivery_w == 39, na.rm = TRUE)
  ) %>%
  ggplot(aes(x = date_of_delivery_y,
             y = p)) +
  geom_line()

# Called without arguments, ggplotly() converts the most recent ggplot
# into an interactive plotly widget.
ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# Share of records with gestation_at_delivery_w equal to 40, by delivery
# year.
# mean(..., na.rm = TRUE) leaves records with missing gestation out of both
# the numerator and the denominator, so the share is taken among records
# with a known gestation and is not diluted by year-to-year differences in
# missingness.
infant %>%
  group_by(date_of_delivery_y) %>%
  summarise(
    p = mean(gestation_at_delivery_w == 40, na.rm = TRUE)
  ) %>%
  ggplot(aes(x = date_of_delivery_y,
             y = p)) +
  geom_line()

# Called without arguments, ggplotly() converts the most recent ggplot
# into an interactive plotly widget.
ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# Level and shape of infant mortality -------------------------------------

# Toy example: forward differences padded with a trailing NA so the result
# has the same length as the input.
append(diff(c(1, 5, 6, 7, 9)), NA)
## [1]  4  1  1  2 NA
# Per date_of_delivery_ym: the mortality rate (imr) and the share of deaths
# with age_at_death_d below 7 (p). Scatter the period-to-period change in
# one against the change in the other.
infant %>%
  group_by(date_of_delivery_ym) %>%
  summarise(
    imr = sum(death) / length(death),
    p = sum(age_at_death_d < 7, na.rm = TRUE) / sum(death)
  ) %>%
  # Forward differences; the last period has no successor, hence the NA.
  mutate(
    diff_imr = append(diff(imr), NA),
    diff_p = append(diff(p), NA)
  ) %>%
  ggplot(mapping = aes(x = diff_imr, y = diff_p)) +
  geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).